/hps/nobackup/birney/users/ian/hmn_fstlibrary(here)
source(here::here("code/scripts/20210628_source.R"))date_of_collection = "20210809"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
out_file = here::here("code/snakemake/20210625/config",
paste(date_of_collection, "_all_traits.tsv", sep = ""))
# Get list of all traits in database
all_traits = gwasrapidd::get_traits()
# Filter out IDs that cause an error
all_traits_tbl = all_traits@traits %>%
dplyr::filter(efo_id != "CHEBI:7916") %>%
# Save to file
readr::write_tsv(out_file)#Need to do them sequentially because the GWAS Catalog can't handle dozens (let alone thousands) of simultaneous queries.
date_of_collection = "20210809"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
# create output dir
out_dir = file.path(long_term_dir,
date_of_collection,
"associations_raw")
dir.create(out_dir, recursive = T)
# Get associations
counter = 0
lapply(all_traits_tbl$efo_id, function(EFO_ID) {
counter <<- counter + 1
# set output file name
out_path = file.path(out_dir,
paste(EFO_ID, ".rds", sep = ""))
# if output file doesn't already exist, get associations and save
if (!file.exists(out_path)){
out = gwasrapidd::get_associations(efo_id = EFO_ID)
saveRDS(out, out_path)
} else {
print(paste(counter, ". ", EFO_ID, ": File already exists.", sep = ""))
}
})date_of_collection = "20210809"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
out_file = here::here("code/snakemake/20210625/config",
paste(date_of_collection, "_filtered_traits.tsv", sep = ""))
all_assocs = lapply(all_traits_tbl$efo_id, function(EFO_ID){
in_path = file.path(long_term_dir,
date_of_collection,
"associations_raw",
paste(EFO_ID, ".rds", sep = ""))
readRDS(in_path)
})
names(all_assocs) = all_traits_tbl$efo_id
# filter for those with more than 50 unique SNPs
filt_assocs = all_assocs %>%
purrr::keep(function(EFO_ID){
length(unique(EFO_ID@risk_alleles$variant_id)) >= 50
})
# Filter `all_traits_tbl` for those traits and save to file
filt_traits_tbl = all_traits_tbl %>%
# and add COVID-19
dplyr::filter(efo_id %in% c(names(filt_assocs), "MONDO_0100096")) %>%
readr::write_tsv(out_file)Full Snakemake pipeline here: https://github.com/brettellebi/human_traits_fst/tree/master/code/snakemake/20210625
target_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd/20210809"
# Fst
fst_all_path = file.path(target_dir, "pegas/fst/consol/all.rds")
fst_all = readRDS(fst_all_path) %>%
lapply(., function(TRAIT_ID){
# convert to DF
TRAIT_ID %>%
data.frame(.) %>%
tibble::rownames_to_column(var = "SNP") %>%
dplyr::select(SNP, FST_PEGAS = Fst)
})
# Bind into df
fst_all_df = fst_all %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::left_join(all_traits_tbl %>%
dplyr::select(EFO_ID = efo_id,
TRAIT = trait),
by = "EFO_ID") %>%
# add `TRAIT` for control SNPs
dplyr::mutate(TRAIT = dplyr::case_when(EFO_ID == "CONTROL" ~ "control",
EFO_ID == "MONDO_0100096" ~ "COVID-19",
TRUE ~ as.character(TRAIT))
)
# Clumped SNPs
## Note that for some traits, no SNPs met the p-value threshold, so Plink1.9 did not produce a `*.clumped` file
## There are therefore fewer files than there are traits
clumped_files = list.files(file.path(target_dir, "plink/clumped"), pattern = "*.clumped", full.names = T)
clumped = lapply(clumped_files, function(FILE){
readr::read_delim(FILE,
delim = " ",
col_types = "i-c-d-------",
trim_ws = T)
})
names(clumped) = stringr::str_remove(basename(clumped_files), ".clumped")
# Filter `fst_all` for clumped SNPs
fst_clumped = lapply(names(fst_all), function(TRAIT_ID){
fst_all[[TRAIT_ID]] %>%
dplyr::filter(SNP %in% clumped[[TRAIT_ID]]$SNP)
})
names(fst_clumped) = names(fst_all)
# Bind into DF
fst_clumped_df = fst_clumped %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::left_join(all_traits_tbl %>%
dplyr::select(EFO_ID = efo_id,
TRAIT = trait),
by = "EFO_ID")
# Add CONTROL and COVID-19 SNPs
fst_clumped_df = rbind(fst_clumped_df,
fst_all_df %>%
dplyr::filter(EFO_ID == "CONTROL")
)pal_all = turbo(n = length(unique(fst_all_df$EFO_ID)))
names(pal_all) = fst_all_df %>%
dplyr::count(TRAIT) %>%
dplyr::arrange(desc(n)) %>%
dplyr::pull(TRAIT)snp_count_plot = list("PRE-CLUMP" = fst_all_df,
"POST-CLUMP" = fst_clumped_df) %>%
dplyr::bind_rows(.id = "FILTER") %>%
# remove CONTROL SNPs
dplyr::filter(TRAIT != "control") %>%
# order variables
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_all)),
FILTER = factor(FILTER, levels = c("PRE-CLUMP", "POST-CLUMP"))) %>%
ggplot() +
geom_bar(aes(TRAIT, fill = TRAIT)) +
scale_fill_manual(values = pal_all) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank()) +
ylab("N LOCI") +
facet_wrap(~FILTER, nrow = 2) +
guides(fill = "none")
plotly::ggplotly(snp_count_plot,
tooltip = c("TRAIT", "y"))width = 12
height = 10
ggsave(here::here("docs/plots/20210810_snp_counts.png"),
plot = snp_count_plot,
device = "png",
width = width,
height = height,
units = "in",
dpi= 400)
ggsave(here::here("docs/plots/20210810_snp_counts.svg"),
plot = snp_count_plot,
device = "svg",
width = width,
height = height,
units = "in")fst_clumped_df %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(MEDIAN_FST = median(FST_PEGAS, na.rm = T),
N_LOCI = dplyr::n()) %>%
dplyr::arrange(MEDIAN_FST) %>%
DT::datatable(.)## Add palette based on median Fst
od_clumped_traits = c("control",
fst_clumped_df %>%
dplyr::filter(TRAIT != "control") %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(MEDIAN_FST = median(FST_PEGAS, na.rm = T)) %>%
dplyr::arrange(MEDIAN_FST) %>%
dplyr::pull(TRAIT)
)
pal_clump = c("#6d6a75",
turbo(n = length(od_clumped_traits) - 1)
)
names(pal_clump) = od_clumped_traitsecdf_all_faceted = fst_clumped_df %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_clump))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_clump) +
theme_bw() +
ggtitle("eCDF") +
xlab(expression(italic(F[ST]))) +
facet_wrap(~TRAIT) +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 4.5))
ggsave(here::here("docs/plots/20210810_ecdf_all_faceted.png"),
plot = ecdf_all_faceted,
device = "png",
width = 30,
height = 22,
units = "in",
dpi = 400)knitr::include_graphics(here::here("docs/plots/20210810_ecdf_all_faceted.png"))
plotly_ecdf_all = fst_clumped_df %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
dplyr::group_by(EFO_ID) %>%
# filter for those with >0 SNPs in the clumped dataset
dplyr::filter(EFO_ID %in% unique(fst_clumped_df$EFO_ID)) %>%
dplyr::ungroup() %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_clump))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_clump) +
theme_bw() +
ggtitle("eCDF") +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5))
plotly::ggplotly(plotly_ecdf_all,
tooltip = c("TRAIT"))## Warning: Removed 3 rows containing non-finite values (stat_ecdf).
Select interesting traits
# get traits to look closer at
target_traits = c("control",
"body height",
"body mass index",
"mathematical ability",
"intelligence",
"self reported educational attainment",
"type ii diabetes mellitus",
"asthma",
"HIV-1 infection",
"cognitive function measurement",
"heart failure",
"diabetes mellitus",
"coronary artery disease",
"mortality",
"longevity",
"schizophrenia",
"skin pigmentation",
"skin pigmentation measurement",
"eye colour measurement",
"facial morphology measurement",
"squamous cell carcinoma",
"tuberculosis",
"cleft palate",
"neurotic disorder",
"reaction time measurement",
"endometrial carcinoma",
"BMI-adjusted hip circumference",
"alcohol dependence measurement",
"loneliness measurement",
"COVID-19"
)
# create new palette based on median Fst
od_target_traits = fst_clumped_df %>%
dplyr::filter(TRAIT %in% target_traits) %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(MEDIAN_FST = median(FST_PEGAS, na.rm = T)) %>%
dplyr::arrange(MEDIAN_FST) %>%
dplyr::pull(TRAIT)
pal_target = c("#6d6a75",
turbo(n = length(od_target_traits) - 1)
)
names(pal_target) = od_target_traits
plotly_ecdf_select = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = od_target_traits)) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target) +
theme_bw() +
ggtitle("eCDF") +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5))
plotly::ggplotly(plotly_ecdf_select,
tooltip = c("TRAIT"))## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
target_traits_sml = c("control",
"body height",
"body mass index",
"mathematical ability",
"intelligence",
"self reported educational attainment",
"cognitive function measurement",
"schizophrenia",
"skin pigmentation",
"skin pigmentation measurement",
"eye colour measurement",
"BMI-adjusted hip circumference"
)
# create new palette based on median Fst
od_target_traits_sml = fst_clumped_df %>%
dplyr::filter(TRAIT %in% target_traits_sml) %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(MEDIAN_FST = median(FST_PEGAS, na.rm = T)) %>%
dplyr::arrange(MEDIAN_FST) %>%
dplyr::pull(TRAIT)
pal_target_sml = c("#6d6a75",
turbo(n = length(od_target_traits_sml) - 1)
)
names(pal_target_sml) = od_target_traits_sml
plotly_ecdf_sml = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits_sml) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = od_target_traits_sml)) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target_sml) +
theme_bw() +
ggtitle("eCDF") +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5))
plotly::ggplotly(plotly_ecdf_sml,
tooltip = c("TRAIT"))## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
skin pigmentationTo check whether the distribution changes if you only have a few SNPs with presumably the highest effect sizes.
# How many unique SNPs for each trait
fst_clumped_df %>%
dplyr::filter(TRAIT %in% target_traits) %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(SNP_COUNT = n_distinct(SNP)) %>%
DT::datatable(.)# Get highest number of SNPs of the pigmentation traits
max_n_snps = fst_clumped_df %>%
dplyr::filter(TRAIT %in% target_traits) %>%
dplyr::group_by(TRAIT) %>%
dplyr::summarise(SNP_COUNT = n_distinct(SNP)) %>%
dplyr::filter(TRAIT %in% c("skin pigmentation measurement", "skin pigmentation", "eye colour measurement")) %>%
dplyr::pull(SNP_COUNT) %>%
max(.)
# Get EFO IDs for target traits
target_efo_ids = fst_clumped_df %>%
dplyr::filter(TRAIT %in% target_traits) %>%
dplyr::distinct(EFO_ID) %>%
dplyr::pull(EFO_ID)
# Filter `clumped` for `target_traits` and then pull out the top `max_n_snps` for each trait
clumped_red = clumped[names(clumped) %in% target_efo_ids] %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::arrange(EFO_ID, P) %>%
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::slice_head(n = max_n_snps) %>%
split(., f = .$EFO_ID)
# Filter `fst_all` for clumped SNPs
fst_clumped_red = fst_all[names(fst_all) %in% target_efo_ids]
clumped_red_names = names(fst_clumped_red)
fst_clumped_red = lapply(names(fst_clumped_red), function(TRAIT_ID){
fst_all[[TRAIT_ID]] %>%
dplyr::filter(SNP %in% clumped_red[[TRAIT_ID]]$SNP)
})
names(fst_clumped_red) = clumped_red_names
# Bind into DF
fst_clumped_df_red = fst_clumped_red %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::left_join(all_traits_tbl %>%
dplyr::select(EFO_ID = efo_id,
TRAIT = trait),
by = "EFO_ID")
# Plot
plotly_ecdf_select_2 = fst_clumped_df_red %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = od_target_traits)) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target) +
theme_bw() +
ggtitle("eCDF") +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5))
plotly::ggplotly(plotly_ecdf_select_2,
tooltip = c("TRAIT"))ridges_plot = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits) %>%
# group by trait to take unique SNPs
dplyr::group_by(TRAIT) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# reverse order of traits to put `body height` at the top
dplyr::mutate(TRAIT_REV = factor(TRAIT, levels = rev(names(pal_target)))) %>%
# plot
ggplot(aes(FST_PEGAS, TRAIT_REV, fill = TRAIT_REV, colour = TRAIT_REV)) +
geom_density_ridges(scale = 2,
bandwidth = 0.003,
calc_ecdf = TRUE,
quantiles = 0.5,
quantile_lines = T,
jittered_points = T,
point_shape = '|', alpha = 0.85, point_size = 2,
position = position_points_jitter(height = 0),
) +
scale_fill_manual(values = pal_target) +
scale_colour_manual(values = darker(pal_target)) +
guides(fill = "none", colour = "none") +
theme_cowplot() +
scale_y_discrete(expand = expansion(add = c(0.2, 2.3))) +
xlab(expression(italic(F[ST]))) +
ylab(NULL)
ridges_plot## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).
width = 8
height = 20
ggsave(here::here("docs/plots/20210810_ridges.png"),
device = "png",
width = width,
height = height,
units = "in",
dpi= 400)
ggsave(here::here("docs/plots/20210810_ridges.svg"),
device = "svg",
width = width,
height = height,
units = "in")ridges_plot_sml = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits_sml) %>%
# group by trait to take unique SNPs
dplyr::group_by(TRAIT) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# reverse order of traits to put `body height` at the top
dplyr::mutate(TRAIT_REV = factor(TRAIT, levels = rev(names(pal_target_sml)))) %>%
# plot
ggplot(aes(FST_PEGAS, TRAIT_REV, fill = TRAIT_REV, colour = TRAIT_REV)) +
geom_density_ridges(scale = 2,
bandwidth = 0.003,
calc_ecdf = TRUE,
quantiles = 0.5,
quantile_lines = T,
jittered_points = T,
point_shape = '|', alpha = 0.85, point_size = 2,
position = position_points_jitter(height = 0),
) +
scale_fill_manual(values = pal_target) +
scale_colour_manual(values = darker(pal_target)) +
guides(fill = "none", colour = "none") +
theme_cowplot() +
scale_y_discrete(expand = expansion(add = c(0.2, 2.3))) +
xlab(expression(italic(F[ST]))) +
ylab(NULL)
ridges_plot_sml## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).
ecdf_plot_face = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = od_target_traits)) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target) +
theme_cowplot(rel_small = 9/14) +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability") +
facet_wrap(~TRAIT, nrow = 6, ncol = 5)
ecdf_plot_face## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
ecdf_plot_face_sml = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits_sml) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_target_sml))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target_sml) +
theme_cowplot(rel_small = 9/14) +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability") +
facet_wrap(~TRAIT, nrow = 3, ncol = 4)
ecdf_plot_face_sml## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
ecdf_plot_all = fst_clumped_df %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% target_traits) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_clump)[names(pal_target) %in% target_traits])) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = pal_target) +
theme_cowplot() +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability")
ecdf_plot_all## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
final_figure = cowplot::ggdraw() +
draw_plot(ridges_plot,
x = 0, y = 0, width = .5, height = 1) +
draw_plot(ecdf_plot_face,
x = .5, y = 0.35, width = .5, height = .65) +
draw_plot(ecdf_plot_all,
x = .5, y = 0, width = .5, height = .35) +
draw_plot_label(label = c("A", "B", "C"), size = 25,
x = c(0, .5, .5), y = c(1, 1, .35),
hjust = c(-.15, .15, .15),
color = "#37323E")## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
width = 12
height = 12
ggsave(here::here("docs/plots/20210810_final.png"),
final_figure,
device = "png",
width = width,
height = height,
units = "in",
dpi= 400)
ggsave(here::here("docs/plots/20210810_final.svg"),
final_figure,
device = "svg",
width = width,
height = height,
units = "in")knitr::include_graphics(here::here("docs/plots/20210810_final.png"))